Loading Packages & Initialization

In [2]:
rm(list=ls())

library(data.table)
library(tidyverse)
library(rJava)
library(RNetLogo)

library(lhs) # For maximin Latin hypercube sampling
library(ggplot2)
library(plotly) # For beautiful plotting
library(caret)
library(randomForest)
library(factoextra)
library(e1071)
library(TSrepr) # for evaluating predictive power

require(gridExtra)

options(warn = -1)
In [3]:
# Select if data generation is wanted
GenerateTTData <- 0
In [4]:
# Run NetLogo headless (1) or with its GUI (0).
Is_Headless <- 1
nl.model <- "Segregation_OneShot"

# Hard-coded local paths to the NetLogo installation and the .nlogo model file.
nl.path <- "C:/Program Files/NetLogo 6.0.4/app"
model.path <- paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/",nl.model,".nlogo")


# Start a NetLogo session and load the model. In headless mode the session
# is registered under the name in nl.model (nl.obj); every NLCommand /
# NLReport call later in this script refers to that object.
# NOTE(review): the GUI branch loads the model WITHOUT nl.obj, while
# run_model always passes nl.obj = nl.model — confirm GUI mode still works.
if (Is_Headless == 0){
    NLStart(nl.path, gui = TRUE,nl.jarname='netlogo-6.0.4.jar')
    NLLoadModel (model.path)
    } else {
    NLStart(nl.path, gui = FALSE,nl.jarname='netlogo-6.0.4.jar', nl.obj = nl.model)
    NLLoadModel (model.path, nl.obj = nl.model)
    
    #NLStart(nl.path, gui = FALSE,nl.jarname='netlogo-6.0.4.jar', nl.obj = nl.model)
    #NLLoadModel (model.path, nl.obj = nl.model )
    }

Model Parameters & Functions

Set model parameters

In [5]:
## Set model parameters
# Number of ABM replications per parameter configuration (outputs are averaged)
nofrep = 3   

# Number of active-learning iterations (model refits) for the adaptive runs
iteration_budget = 21
# order feature names according to their definition order in run_model
feature_names = c("density","%-similar-wanted")  
# NetLogo reporter collected as the metamodel's target variable
output_name = c("percent-similar")

# Number of input parameters of the agent-based model
nofparams = length(feature_names)      

# Random forest hyperparameters: number of trees, variables tried per split
ntree = 120
mtry = 2

Set user parameters

In [6]:
# Per-row test error reported by get_test_predictions; alternatives: "MAPE", "BIAS"
error_type = "RMSE"

# choose the uncertainty measure used by sample_selection; alternative: "range"
selection_metric <- "sd"

unlabeled_ins = 700     # size of the unlabeled candidate pool
test_ins = 400          # test-set size
train_ins_oneshot = 700 # training-set size for the one-shot benchmark
train_ins_Ad = 200      # initial training-set size for adaptive sampling

# Set selection parameters
selected_ins = 25 # number of instances selected (labeled) in each step

# Set elimination parameters
h <- 1 # number of variables eliminated in each step

Define functions

run_model

In [7]:
#run_model <- function(feature_names,feature_values){ # both should be in character list format
# Push one parameter configuration into the NetLogo model, run it for
# 100 ticks, and report the output metric.
#
# feature_values : parameter values ordered exactly like the global
#                  `feature_names` (callers setcolorder beforehand).
# Uses globals: feature_names, output_name, nl.model.
run_model <- function(feature_values){ # both should be in character list format

    for (j in seq_along(feature_names)) {
        set_cmd <- paste0("set ", feature_names[j], " ", feature_values[j])
        NLCommand(set_cmd, nl.obj = nl.model)
    }
    NLCommand("setup", nl.obj = nl.model)
    NLDoCommand(100, "go", nl.obj = nl.model)
    NLReport(output_name, nl.obj = nl.model)
}

run_replicas

In [8]:
#run_replicas <- function(nofrep,feature_names,feature_values) {
# Run the ABM `nofrep` times with the same inputs and return the mean
# output (aggregates over the model's stochastic replications).
run_replicas <- function(nofrep,feature_values) {
    outcomes <- numeric(nofrep)
    for (r in seq_len(nofrep)) {
        outcomes[r] <- run_model(feature_values)
    }
    mean(outcomes)
}

run_ABM

In [9]:
#run_ABM = function(nofrep,nofinstances,unlabeledset,featurenames = feature_names){
# Label the first `nofinstances` rows of `unlabeledset` by running the
# agent-based model on each row's parameter values.
# Side effects: setcolorder() and `:=` modify `unlabeledset` by reference.
run_ABM = function(nofrep,nofinstances,unlabeledset){
   # Align column order with feature_names so values map onto the right
   # NetLogo parameters inside run_model.
   unlabeledset = setcolorder(unlabeledset,feature_names)
   for (row_i in seq_len(nofinstances)) {
        unlabeledset[row_i, output := run_replicas(nofrep, as.matrix(unlabeledset[row_i,]))]
   }
   return(unlabeledset)
}

error functions

In [10]:
#error functions on test data

# Root-mean-squared error between actual and predicted values.
# Returns a single non-negative number; NA values propagate.
rmse_func <- function(actual, predicted){
    squared_errors <- (actual - predicted)^2
    sqrt(mean(squared_errors))
}

# Element-wise absolute percentage error (in percent).
# NOTE(review): vectorized — returns one value per element, no mean taken;
# divides by `actual`, so actual == 0 yields Inf/NaN.
mape_func <- function(actual,predicted){
    pct_error <- abs(actual - predicted) / actual
    pct_error * 100
}

# Element-wise relative bias: positive when the prediction undershoots,
# negative when it overshoots. Division by `actual` — 0 yields Inf/NaN.
bias_func <- function(actual,predicted){
    (actual - predicted) / actual
}

#error functions on train data

# Out-of-bag (OOB) training error of a fitted randomForest model.
# ("obb" is a historical typo for OOB; the name is kept so existing
# callers keep working.)
#
# model : a randomForest fit — a list carrying $type and, depending on
#         the task, $mse/$ntree (regression) or $err.rate (classification).
# Returns: regression → OOB MSE after the final tree (single number);
#          classification → the full err.rate matrix.
obb_error_func <- function(model){
   if(model$type == "regression"){
        oob_error = model$mse[model$ntree] 
    }else if(model$type == "classification"){
        oob_error = model$err.rate 
    }else{
        # BUG FIX: previously an unknown type fell through and crashed
        # with the cryptic "object 'oob_error' not found".
        stop("obb_error_func: unsupported model type '", model$type, "'",
             call. = FALSE)
    }
    return(oob_error)
}

get_test_predictions

In [11]:
# prediction functions

# Predict `testset` with `model`, attach a per-row error column of the
# requested type, and compute overall MAE / RMSE / MAPE on the test set.
#
# model     : fitted regression model accepted by predict()
# testset   : data.table of features plus the true "output" column
# errortype : one of "MAPE", "RMSE", "BIAS" — selects the per-row error
#             column added to the returned table
# Returns list: [[1]] predictions table, [[2]] 1x3 matrix (mae, rmse,
# mape; from TSrepr), [[3]] names of the true/predicted output columns.
get_test_predictions <- function(model,testset,errortype){

    predictedLabels <- predict(model, testset)
    predictedLabels <- cbind(testset,predictedLabels)
    setnames(predictedLabels, "predictedLabels","pred_output")

    # output_variables[1] = true output, output_variables[2] = prediction
    output_variables = colnames(select(predictedLabels, contains("output")))

    # BUG FIX: the original tested the GLOBAL `error_type` here, silently
    # ignoring the `errortype` argument. Current callers pass the global,
    # so behavior is unchanged, but the parameter is now honoured.
    if(errortype == "MAPE"){
        predictedLabels[,MAPE := mapply(function(x,y) mape_func(x,y),get(output_variables[1]),get(output_variables[2]))]
    }
    if(errortype == "RMSE"){
        # NOTE(review): rmse_func on a single (actual, predicted) pair
        # reduces to the absolute error of that pair.
        predictedLabels[,RMSE := mapply(function(x,y) rmse_func(x,y),get(output_variables[1]),get(output_variables[2]))]
    }
    if(errortype == "BIAS"){
        predictedLabels[,BIAS := mapply(function(x,y) bias_func(x,y),get(output_variables[1]),get(output_variables[2]))]
    }

    actuals    = predictedLabels[,get(output_variables[1]), with = TRUE]
    predicted  = predictedLabels[,get(output_variables[2]), with = TRUE]

    # Overall test-set performance (mae/rmse/mape come from TSrepr).
    performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
    performance_temp[1] = mae(actuals, predicted)
    performance_temp[2] = rmse(actuals, predicted)
    performance_temp[3] = mape(actuals, predicted)

    return(list(predictedLabels,performance_temp,output_variables))
}

sample_selection

In [12]:
# Adaptive sample selection function with an uncertainty measure depending on "selection_metric"
# Picks the `selected_ins` rows of `unlabeled_set` on which the random
# forest is most uncertain, measured across the individual trees'
# predictions. The global `selection_metric` chooses "sd" or "range".
# Side effect: adds an `idx` column to `unlabeled_set` by reference.
sample_selection <- function(selected_ins,unlabeled_set,model){   
    # After the transpose: one data.table column per unlabeled instance,
    # one row per tree in the forest.
    ind_pred <- t(predict(model, unlabeled_set,predict.all = TRUE)$individual) %>% 
                data.table() # predictions by each tree in the forest
    ind_pred_eval = data.table()
    
    # standard deviation calculation (per instance, across trees)
    s_dev = sapply(ind_pred, sd) %>% data.table()
    setnames(s_dev,".","sd")
    ind_pred_eval = cbind(ind_pred_eval,s_dev)
    
    # range calculation
    # NOTE(review): sapply(..., range) gives per-instance (min, max);
    # inside the j-expression below, `range[,1]`/`range[,2]` resolve to
    # the outer `range` data.table's columns, so this computes max - min
    # per instance. Fragile scoping — verify before refactoring.
    range = sapply(ind_pred, range) %>% t() %>% data.table()
    range = range[,.(range = abs(range[,1] - range[,2]))]
    setnames(range,"range.V1","range")
    ind_pred_eval = cbind(ind_pred_eval,range)
    
    ind_pred_eval[,idx := 1:.N]
    
    # Keep the selected_ins most-uncertain instances under the chosen metric.
    if(selection_metric == "sd") {
      ind_pred_eval = ind_pred_eval[order(-sd)][1:selected_ins]
    }else if(selection_metric == "range"){
      ind_pred_eval = ind_pred_eval[order(-range)][1:selected_ins]
    }
    
    # Map the winning row indices back onto the unlabeled set.
    unlabeled_set[,idx := 1:.N]    
    train_candidates = unlabeled_set[ind_pred_eval$idx]
    
    return(train_candidates)
}

random_sample_selection

In [13]:
# Random sample selection
# Pick `selected_ins` rows uniformly at random from the unlabeled pool.
# Side effect: adds an `idx` column to `unlabeled_set` by reference
# (callers pass a copy of the pool, keeping indices consistent).
random_sample_selection <- function(selected_ins,unlabeled_set){

    unlabeled_set[,idx := 1:.N]

    chosen_idx = sample(unlabeled_set$idx, selected_ins, replace = FALSE, prob = NULL)
    unlabeled_set[idx %in% chosen_idx]
}

get_variable_importance

In [14]:
# Rank the model's features by permutation importance (type = 1 is the
# mean-decrease-in-accuracy / %IncMSE measure), most important first.
#
# model : a fitted randomForest model (trained with importance = TRUE)
# Returns the global `feature_names` reordered by decreasing importance.
# NOTE(review): mapping ranks through feature_names assumes the model was
# trained on columns in feature_names order — confirm at call sites.
get_variable_importance <- function(model){
    importances <- importance(model, type = 1, scale = FALSE)
    ranking <- order(importances, decreasing = TRUE)
    # (removed: unused `ordered.importances` local from the original)
    feature_names[ranking]
}

feature_elimination

In [15]:
# Drop the `h` least-important features from the ranked list.
#
# h      : number of features to eliminate in this step
# total_numof_eliminated_vars : count of features removed in earlier steps
# ranked_features : all feature names, sorted by decreasing importance
#
# Returns list(columns_left, updated eliminated count, h, names of the
# columns eliminated in this step).
feature_elimination <- function(h,total_numof_eliminated_vars,ranked_features){ 
    numof_columns_left = length(ranked_features) - (total_numof_eliminated_vars + h)
    columns_left = ranked_features[1:numof_columns_left]
    
    # BUG FIX: the original called setdiff() on two scalar counts, which
    # returned only the single last index — so whenever h > 1, all but one
    # of the step's eliminated column names were silently dropped. Use the
    # full index range of this step's eliminated features instead.
    last_active = length(ranked_features) - total_numof_eliminated_vars
    eliminated_columns = ranked_features[(numof_columns_left + 1):last_active]
    
    # update total_numof_eliminated_vars
    total_numof_eliminated_vars = length(ranked_features) - length(columns_left)
    
    return(list(columns_left,total_numof_eliminated_vars,h,eliminated_columns))
 }

Generate Unlabeled Data Pool

Latin hyper cube sampling

In [16]:
if(GenerateTTData == 1){
    # Space-filling design: maximin Latin hypercube over the unit cube,
    # one column per model parameter.
    unlabeled_pool = as.data.table(maximinLHS(n = unlabeled_ins, k = nofparams, dup = 5))
    
    # Map the unit-cube coordinates onto the parameter range [10, 90].
    unlabeled_pool$V1 = qunif(unlabeled_pool$V1, 10, 90) 
    unlabeled_pool$V2 = qunif(unlabeled_pool$V2, 10, 90) 
    setnames(unlabeled_pool, c(paste0("V",1:nofparams)), feature_names)
    
    unlabeled_pool[,idx := 1:.N]
        
    fwrite(unlabeled_pool, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/unlabeled_pool_",Sys.Date(),".csv"))
}else{
    # Reload a previously generated pool from disk; the filters below are
    # specific to that stored .csv.
    unlabeled_pool <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/unlabeled_pool_04122019.csv")   
    unlabeled_pool <- head(unlabeled_pool[`%-similar-wanted` < 90 & `density` < 90],700) 
}
In [17]:
data_candidates =copy(unlabeled_pool)
In [18]:
pca_unlabeled_pool <- princomp(unlabeled_pool[,-c("idx")], cor = TRUE, scores = TRUE)
pca_unlabeled_pool_components <- get_pca_ind(pca_unlabeled_pool)
p_unlabeled_pool <- ggplot(data = data.table(pca_unlabeled_pool_components$coord[,1:2]), aes(x = Dim.1, y = Dim.2)) +
                    geom_point() +
                    labs( title = "") 
p_unlabeled_pool

Generate Test Set

In [19]:
if(GenerateTTData == 1){
    # Build a fresh test set: reuse the pool's structure, then overwrite
    # the features with independent uniform draws over the parameter ranges.
    test_set <- head(unlabeled_pool,test_ins)
    ################## TODO: could be generalized based on each variable's data type
    test_set$density            = runif(test_ins, 10, 90) 
    test_set$`%-similar-wanted` = runif(test_ins, 10, 90) 
    test_set[,c("idx"):= NULL]
       
    print(paste0("ABM run start time : ",Sys.time()))
    # Label the test set by running the agent-based model on every row.
    test_set = run_ABM(nofrep,test_ins,test_set) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))
    
    fwrite(test_set, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/test_set_",Sys.Date(),".csv"))
}else{
    # Reload a previously generated test set from disk.
    test_set <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/test_set_04122019.csv")  
    #below part is only for this .csv
    test_set <- head(test_set[`%-similar-wanted` < 90],800) 
    
    # Down-sample to test_ins rows chosen uniformly at random.
    test_set[,idx := 1:.N]    
    test_set_idx = sample(test_set$idx, test_ins, replace = FALSE, prob = NULL)   
    test_set = test_set[idx %in% test_set_idx]
    test_set[,idx:= NULL]
}

Timing notes: 10 instances × 10 reps ≈ 1 min; 100 × 10 ≈ 14 min; 900 × 10 started 2019-12-03 07:54:10 +03 and finished ≈ 09:16.

In [20]:
pca_test_set <- princomp(test_set, cor = TRUE, scores = TRUE)
pca_test_set_components <- get_pca_ind(pca_test_set)
p_test_set <- ggplot(data = data.table(pca_test_set_components$coord[,1:2]), aes(x = Dim.1, y = Dim.2)) +
                    geom_point() +
                    labs( title = "") 
p_test_set

Benchmark : One-shot sampling, No feature elimination

Generate Training Set

Select a very big data pool (nofinstances should be very high), e.g. 1000.

In [135]:
if(GenerateTTData == 1){
    # One-shot benchmark design: a single large maximin Latin hypercube.
    training_set = as.data.table(maximinLHS(n = train_ins_oneshot, k = nofparams, dup = 5))
    
    # Map the unit-cube coordinates onto the parameter range [10, 90].
    training_set$V1 = qunif(training_set$V1, 10, 90) 
    training_set$V2 = qunif(training_set$V2, 10, 90) 
    setnames(training_set, c(paste0("V",1:nofparams)), feature_names)
    
    training_set$output <- 0.00
    
    print(paste0("ABM run start time : ",Sys.time()))
    # BUG FIX: the original passed the undefined object `LHSample` here
    # (a leftover name from an earlier revision), which would error at
    # runtime; the set being labeled is `training_set`.
    training_set = run_ABM(nofrep,train_ins_oneshot,training_set) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))  
    
    fwrite(training_set, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/training_set_",Sys.Date(),".csv"))
    
}else{
    # NOTE(review): this branch reads LHSample_Data_... while the branch
    # above writes training_set_... — confirm the intended file name.
    training_set <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Data_04122019.csv")
    training_set <- head(training_set[`%-similar-wanted` < 90],700)
}
In [113]:
head(FinalTrainData_Rd)
head(training_set)
A data.table: 6 × 3
density%-similar-wantedoutput
<dbl><dbl><dbl>
47.7666067.5859899.70160
54.2428023.0359664.43247
83.6031261.8969297.98403
73.9415357.7408096.33635
29.0425362.1912599.06782
63.7140841.1094785.30121
A data.table: 6 × 3
density%-similar-wantedoutput
<dbl><dbl><dbl>
65.0115743.8793887.94083
20.7410585.2116979.24560
49.8701434.0314383.24447
33.4364622.8397168.85127
86.3674861.2024098.02329
50.0324459.8707396.84122
In [136]:
one_shot_data = copy(training_set)
In [115]:
training_set = copy(FinalTrainData_Rd)

Visualization

In [137]:
pca_training_set <- princomp(training_set[,.SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)

pca_training_set_components <- get_pca_ind(pca_training_set)
pca_training_set_components <-cbind(pca_training_set_components$coord[,1:2],training_set[,.SD, .SDcols = c("output")])
p_training_set <- ggplot(data = pca_training_set_components, aes(x = Dim.1, y = Dim.2)) +
             geom_point(aes(colour = output)) +
             labs( title = "", legend = "output") 
p_training_set

Train & Test Metamodel

In [138]:
model_oneshot <- randomForest(x = training_set[, -c("output")], y = training_set$output, importance = TRUE,ntree = ntree, mtry = mtry)
model_oneshot
Call:
 randomForest(x = training_set[, -c("output")], y = training_set$output,      ntree = ntree, mtry = mtry, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 120
No. of variables tried at each split: 2

          Mean of squared residuals: 9.568796
                    % Var explained: 96.82
In [139]:
obb_error_oneshot <- obb_error_func(model_oneshot)
In [ ]:
#OBB_pred = cbind(training_set$output,model_oneshot$predicted)
#names(OBB_pred) <- c("actual","predicted")
In [140]:
plot(model_oneshot$mse, type="l")
In [141]:
test_prediction_oneshot = get_test_predictions(model_oneshot,test_set,error_type)
predictedLabels_oneshot = test_prediction_oneshot[[1]]

performance_table_oneshot = data.table(iter = numeric(), mae= numeric(),rmse= numeric(), mape = numeric())
# Keep test set error records
performance_table_oneshot = rbind(performance_table_oneshot, data.table(1, test_prediction_oneshot[[2]]), use.names = FALSE)

output_variables = test_prediction_oneshot[[3]]
In [142]:
performance_table_oneshot
obb_error_oneshot
head(predictedLabels_oneshot)
A data.table: 1 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
10.89087362.285571.29725
9.56879576954223
A data.table: 6 × 5
density%-similar-wantedoutputpred_outputRMSE
<dbl><dbl><dbl><dbl><dbl>
39.2718236.4603986.9769386.261760.715169836
28.1221068.0556499.8650499.858100.006939698
75.8097917.8004457.7748757.747860.027003440
57.7682528.6434573.8125772.940590.871973723
18.5317548.0377893.7924294.073000.280580695
80.0970142.1845582.7019184.049101.347192786
In [143]:
performance_molten_oneshot <- melt(data = performance_table_oneshot
                             , id.vars = 'iter')
setnames(performance_molten_oneshot, c("variable","value"),c("errortype","errorvalue"))
In [144]:
p_oneshot <- ggplot(predictedLabels_oneshot,aes(x = get(output_variables[1]), y = get(output_variables[2]), color = (get(output_variables[2]) - get(output_variables[1])))) +
            geom_point() +
            geom_abline() +
            xlab("actual values") +
            ylab("fitted values")

p_oneshot

Random Sampling & No Feature Elimination

Generate Training Set

Select a relatively big data pool (nofinstances should be medium), e.g. 400.

In [31]:
if(GenerateTTData == 1){
   
    training_set_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
    
    training_set_Ad$V1 = qunif(training_set_Ad$V1, 10, 90) 
    training_set_Ad$V2 = qunif(training_set_Ad$V2, 10, 90) 
    setnames(training_set_Ad, c(paste0("V",1:nofparams)), feature_names)
    training_set_Ad$output <- 0.00
    
    print(paste0("ABM run start time : ",Sys.time()))
    training_set_Ad = run_ABM(nofrep,train_ins_Ad,training_set_Ad) %>% as.data.table()
    print(paste0("ABM run end time : ",Sys.time()))
    
    fwrite(training_set_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))

}else{
    training_set_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
    training_set_Ad <- head(training_set_Ad[`%-similar-wanted` < 90  & `density` < 90],200)

}
In [32]:
adaptive_initial_data = copy(training_set_Ad)

Visualization

In [33]:
pca_training_set_Ad <- princomp(adaptive_initial_data[,.SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)

pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <-cbind(pca_training_set_Ad_components$coord[,1:2],adaptive_initial_data[,.SD, .SDcols = c("output")])
p_training_set_Ad <- ggplot(data = pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
             geom_point(aes(colour = output)) +
             labs( title = "", legend = "output") 
p_training_set_Ad

Train & Test Metamodel

In [34]:
# Decide on strategy:
#iteration_budget = 3   #specified above

## initialize record tables Record train candidates
train_candidates_table = data.table()

# Record model performances (one row of mae/rmse/mape per iteration)
performance_table = data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())

# Record obb_error table (out-of-bag error per iteration)
obb_error = data.table(iter = numeric(), obb_error = numeric())

## initialize variables
# keep test set undistorted: predictions are accumulated on a copy
predictedLabels_table = copy(test_set)
In [35]:
set.seed(10)
print(paste0("section start time : ",Sys.time()))
iter = 1
# Random-sampling baseline: each iteration refits the forest on the
# current training data, scores it on the test set, then labels
# `selected_ins` randomly chosen pool instances and adds them to the
# training set.
while(iter <= iteration_budget){   
    print(iter)

    trainx = training_set_Ad[,.SD, .SDcols = feature_names]
    trainy = training_set_Ad$output
    
    # Train the model
    model_Sub <- randomForest( x = trainx, y =  trainy,importance = TRUE,ntree = ntree, mtry = mtry)
    # Keep each iteration's model as model_Sub_<iter> for later inspection.
    assign(paste0("model_Sub_",iter),model_Sub)
                     
    obb_error = rbind(obb_error,data.table(iter,obb_error_func(model_Sub)),use.names=FALSE)
    
    # test the model on test set
    test_predictions_Sub = get_test_predictions(model_Sub,test_set,error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    # Suffix prediction/error columns with the iteration number so they can
    # sit side by side in predictedLabels_table.
    setnames(predictedLabels_Sub,c("pred_output",error_type), c(paste0("pred_output_",iter),paste0(error_type,"_",iter)))    
    predictedLabels_table = cbind(predictedLabels_table,predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_",iter),paste0(error_type,"_",iter))])
    
    # Keep test set error records
    performance_table = rbind(performance_table,data.table(iter,test_predictions_Sub[[2]]), use.names = FALSE)    

    if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.
        
    ## sample selection from unlabeled data select candidates
    unlabeled_set <- copy(unlabeled_pool)
    train_candidates = random_sample_selection(selected_ins,unlabeled_set)
        
    # Eliminate train candidates from the unlabeled pool
    unlabeled_pool = unlabeled_pool[- train_candidates$idx]
    rm(unlabeled_set)
    
    # run ABM to find outputs of train candidates
    print(paste0("ABM train_candidate run start time : ",Sys.time()))
    train_candidates = run_ABM(nofrep,selected_ins,train_candidates)
    print(paste0("ABM train_candidate run end time : ",Sys.time()))
    
    train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))

    # Add new data to train data
    training_set_Ad = rbind(training_set_Ad,train_candidates[,-c("idx")])
    }
    iter = iter + 1
}

# TODO: add a plot so progress is shown at every iteration.
#setcolorder(data,variableorder) ################# NOTE: this may need to be called somewhere - careful!!
print(paste0("section end time : ",Sys.time()))
[1] "section start time : 2020-01-09 07:31:59"
[1] 1
[1] "ABM train_candidate run start time : 2020-01-09 07:31:59"
[1] "ABM train_candidate run end time : 2020-01-09 07:32:42"
[1] 2
[1] "ABM train_candidate run start time : 2020-01-09 07:32:42"
[1] "ABM train_candidate run end time : 2020-01-09 07:33:21"
[1] 3
[1] "ABM train_candidate run start time : 2020-01-09 07:33:21"
[1] "ABM train_candidate run end time : 2020-01-09 07:34:06"
[1] 4
[1] "ABM train_candidate run start time : 2020-01-09 07:34:06"
[1] "ABM train_candidate run end time : 2020-01-09 07:34:41"
[1] 5
[1] "ABM train_candidate run start time : 2020-01-09 07:34:41"
[1] "ABM train_candidate run end time : 2020-01-09 07:35:31"
[1] 6
[1] "ABM train_candidate run start time : 2020-01-09 07:35:31"
[1] "ABM train_candidate run end time : 2020-01-09 07:36:04"
[1] 7
[1] "ABM train_candidate run start time : 2020-01-09 07:36:04"
[1] "ABM train_candidate run end time : 2020-01-09 07:36:37"
[1] 8
[1] "ABM train_candidate run start time : 2020-01-09 07:36:37"
[1] "ABM train_candidate run end time : 2020-01-09 07:37:26"
[1] 9
[1] "ABM train_candidate run start time : 2020-01-09 07:37:26"
[1] "ABM train_candidate run end time : 2020-01-09 07:38:39"
[1] 10
[1] "ABM train_candidate run start time : 2020-01-09 07:38:39"
[1] "ABM train_candidate run end time : 2020-01-09 07:39:49"
[1] 11
[1] "ABM train_candidate run start time : 2020-01-09 07:39:49"
[1] "ABM train_candidate run end time : 2020-01-09 07:40:29"
[1] 12
[1] "ABM train_candidate run start time : 2020-01-09 07:40:29"
[1] "ABM train_candidate run end time : 2020-01-09 07:41:07"
[1] 13
[1] "ABM train_candidate run start time : 2020-01-09 07:41:08"
[1] "ABM train_candidate run end time : 2020-01-09 07:42:14"
[1] 14
[1] "ABM train_candidate run start time : 2020-01-09 07:42:15"
[1] "ABM train_candidate run end time : 2020-01-09 07:42:45"
[1] 15
[1] "ABM train_candidate run start time : 2020-01-09 07:42:45"
[1] "ABM train_candidate run end time : 2020-01-09 07:43:21"
[1] 16
[1] "ABM train_candidate run start time : 2020-01-09 07:43:21"
[1] "ABM train_candidate run end time : 2020-01-09 07:44:18"
[1] 17
[1] "ABM train_candidate run start time : 2020-01-09 07:44:19"
[1] "ABM train_candidate run end time : 2020-01-09 07:45:00"
[1] 18
[1] "ABM train_candidate run start time : 2020-01-09 07:45:00"
[1] "ABM train_candidate run end time : 2020-01-09 07:45:47"
[1] 19
[1] "ABM train_candidate run start time : 2020-01-09 07:45:47"
[1] "ABM train_candidate run end time : 2020-01-09 07:46:47"
[1] 20
[1] "ABM train_candidate run start time : 2020-01-09 07:46:47"
[1] "ABM train_candidate run end time : 2020-01-09 07:47:18"
[1] 21
[1] "section end time : 2020-01-09 07:47:18"

Started 2020-01-09 07:31:59, ended 2020-01-09 07:47:18 — 3 replications × 25 samples × 20 selection iterations = 1500 ABM runs.

In [36]:
# Final records
FinalTrainData_Rd = copy(training_set_Ad)
performance_table_Rd = copy(performance_table)
train_candidates_table_Rd  = copy(train_candidates_table)
predictedLabels_table_Rd = copy(predictedLabels_table)
obb_error_Rd = copy(obb_error)
In [37]:
#fwrite(FinalTrainData_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_Rd_BasicCode_OneShot_120tree_",Sys.Date(),".csv") )
#fwrite(performance_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_Rd_BasicCode_OneShot_120tree_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_Rd_BasicCode_OneShot_120tree_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_Rdd_BasicCode_OneShot_120tree_",Sys.Date(),".csv") )
#fwrite(obb_error_Rd,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_Rd_BasicCode_OneShot_120tree_",Sys.Date(),".csv") )
In [38]:
# show results
nrow(FinalTrainData_Rd)
performance_table_Rd 
train_candidates_table_Rd  
head(predictedLabels_table_Rd)
obb_error_Rd
700
A data.table: 21 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
12.0984314.7266303.057153
21.9814124.5700332.873256
31.7611734.2120722.544558
41.7080334.2099872.490380
51.6503864.0955142.394935
61.6902834.2758092.485206
71.5775664.1538292.326456
81.5707714.3654682.319439
91.5249814.3385662.241223
101.4775844.3005322.182432
111.4693944.4318482.176106
121.4583504.4039792.151240
131.4653074.3695262.170581
141.4262054.3182672.106937
151.4291654.4171802.119063
161.3311204.1490631.991292
171.3189584.0913221.971553
181.2860994.0863851.914302
191.2389443.9778631.852718
201.2534603.9979531.871194
211.2179533.9421701.819841
A data.table: 500 × 5
density%-similar-wantedidxoutputiter
<dbl><dbl><int><dbl><dbl>
60.3678955.42004 73 95.279551
73.3510429.16321133 72.334571
21.2538121.63017135 76.585341
71.8017920.57627143 61.708001
15.4481070.93662148100.000001
59.7987239.86570150 81.946661
52.1575618.57792151 58.676141
36.3755837.53732155 88.151601
32.4994719.08426163 65.366461
65.7572887.75657213 50.767141
40.0538625.70328243 74.737091
37.5052756.35875264 97.461381
61.2146720.74994287 63.609361
78.0484144.68347304 87.031611
59.2384213.39637327 54.034471
63.2360284.93076375 51.824981
72.7792073.39243379 97.760311
40.6387457.30135385 97.059721
16.6967052.17594436 99.464221
43.2861582.95672473 55.216061
25.1236214.93195490 72.398451
31.8452363.90615505 98.985941
85.8448488.20989542 50.415781
16.6460157.05713640 99.704811
21.9377016.78390651 73.696011
75.0797952.73906 49 94.208382
34.6003572.34013 59 99.816592
27.6374920.46809 74 71.039742
19.0249117.32867199 79.362242
18.4485058.86279205 99.275762
...............
68.8778126.2541421067.1222619
60.4626923.6073821162.9075319
28.8317113.6859422669.5136719
13.2655562.0414223399.8531619
51.5434634.7011324783.9994219
22.8865225.02460 1183.4765420
27.7966045.41240 2491.7317920
37.2722818.42808 2763.8591420
36.5900912.41078 3061.9231120
59.9486165.89318 3498.5908220
41.8380778.67310 5372.3396720
19.8482529.66990 5685.3107320
43.1096725.65882 6374.8254620
44.3220428.22947 7674.4121420
11.6194244.64752 8896.7046320
70.1350269.01784 9199.6550920
56.3512212.24678 9256.6777620
86.3859847.45947 9786.4396920
75.0080940.72460 9983.8403920
54.7581076.5828710463.7081120
29.7848138.2638410589.4829720
17.6822545.8530710794.9937320
71.1072363.1460810898.2783620
47.8616941.8662814188.7601220
50.3318820.3808715665.9204220
38.9548950.4029416297.5464620
83.9096627.7771918864.8284220
63.0259047.6594119088.5833420
40.7217142.0004519290.5471220
27.6754140.3923021892.3221720
A data.table: 6 × 45
density%-similar-wantedoutputpred_output_1RMSE_1pred_output_2RMSE_2pred_output_3RMSE_3pred_output_4...pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20pred_output_21RMSE_21
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
39.2718236.4603986.9769385.096481.8804481985.348491.62843353185.104141.8727805386.21958...86.262880.714047886.054230.9226935886.088420.8885086585.658601.3183272586.454380.52254090
28.1221068.0556499.8650499.855980.0090642999.858790.00625042599.878240.0132024199.85118...99.838790.026248199.851840.0132045499.839930.0251085999.845570.0194762999.847040.01800473
75.8097917.8004457.7748758.079410.3045396857.825700.05083679357.243850.5310173457.36777...58.261670.486802258.228400.4535339958.222330.4474647258.379530.6046605458.231390.45652110
57.7682528.6434573.8125772.920020.8925496772.715201.09736104472.717701.0948631372.74901...70.788273.024293871.181022.6315476572.386651.4259196772.396851.4157114571.973281.83928304
18.5317548.0377893.7924294.063730.2713098594.077870.28545373394.418450.6260290294.39578...94.368390.575970894.263540.4711230894.248920.4565014194.326670.5342522494.374870.58245296
80.0970142.1845582.7019185.229082.5271657485.111092.40918380186.142833.4409210284.81016...83.393630.691720583.545660.8437539583.504490.8025830983.239840.5379249583.704021.00210456
A data.table: 21 × 2
iterobb_error
<dbl><dbl>
115.080129
211.680522
3 7.997086
4 7.661824
5 7.335193
611.921274
710.506279
8 7.658411
9 7.464558
10 6.744633
11 5.575077
12 5.011367
13 5.432426
14 5.002948
15 5.007190
16 4.579059
17 4.424223
18 2.817256
19 2.983130
20 3.325246
21 2.966596
In [39]:
# Long-format error table for plotting (one row per iter x error type).
performance_molten_Rd <- melt(data = performance_table_Rd
                             , id.vars = 'iter')
setnames(performance_molten_Rd, c("variable","value"),c("errortype","errorvalue"))
# Random-sampling learning curves with the one-shot benchmark overlaid as
# dashed horizontal reference lines.
# BUG FIX: geom_hline(stat = "hline") was removed in ggplot2 2.0 and now
# errors; geom_hline with a yintercept aesthetic needs no stat argument.
p_Rd = ggplot(performance_molten_Rd, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) + 
          geom_line(lwd=1) +
          geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype), linetype = "dashed")
p_Rd

Final Visualization

In [40]:
pca_final_Rd_training_set <- princomp(FinalTrainData_Rd[,.SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)

pca_final_Rd_training_set_components <- get_pca_ind(pca_final_Rd_training_set)
pca_final_Rd_training_set_components <-cbind(pca_final_Rd_training_set_components$coord[,1:2],FinalTrainData_Rd[,.SD, .SDcols = c("output")])
p_final_Rd_training_set <- ggplot(data = pca_final_Rd_training_set_components, aes(x = Dim.1, y = Dim.2)) +
             geom_point(aes(colour = output)) +
             labs( title = "", legend = "output") 
p_final_Rd_training_set
In [41]:
grid.arrange(p_training_set_Ad,p_final_Rd_training_set, ncol=2)

Adaptive Sampling & No Feature Elimination

Generate Training Set

Select a relatively big data pool (nofinstances should be medium), e.g. 400.

In [42]:
training_set_Ad = copy(adaptive_initial_data)
unlabeled_pool =copy(data_candidates)
In [ ]:
#if(GenerateTTData == 1){
#   
#    LHSample_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
#    
#    LHSample_Ad$V1 = qunif(LHSample_Ad$V1, 10, 90) 
#    LHSample_Ad$V2 = qunif(LHSample_Ad$V2, 10, 90) 
#    setnames(LHSample_Ad, c("V1","V2"), feature_names)
#    LHSample_Ad$output <- 0.00
#    
#    paste0("ABM run start time : ",Sys.time())
#    LHSample_Ad = run_ABM(nofrep,train_ins_Ad,LHSample_Ad) %>% as.data.table()
#    paste0("ABM run end time : ",Sys.time())
#    
#    fwrite(LHSample_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
#
#}else{
#    LHSample_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
#    LHSample_Ad <- head(LHSample_Ad[`%-similar-wanted` < 90  & `density` < 90],200)
#
#}

Visualization

In [43]:
pca_training_set_Ad <- princomp(training_set_Ad[,-c("output")], cor = TRUE, scores = TRUE)
In [44]:
#fviz_pca_ind(pca_LHSample,
#             col.ind = "cos2", # Color by the quality of representation
#             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
#              geom="point"
#             )

pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <-cbind(pca_training_set_Ad_components$coord[,1:2],training_set_Ad[,c("output")])
p_training_set_Ad <- ggplot(data = pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
                     geom_point(aes(colour = output)) +
                     labs( title = "", legend = "output") 
p_training_set_Ad

Train & Test Metamodel

In [ ]:
# Decide on strategy:
#iteration_budget = 3

#h = 1 # specify how many variable will be eliminated in each elimination iteration
In [45]:
## Initialize the per-iteration record tables.
# Instances picked from the unlabeled pool at each iteration.
train_candidates_table <- data.table()

# Test-set error of the metamodel at each iteration.
performance_table <- data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())

# Out-of-bag error of the random forest at each iteration.
obb_error <- data.table(iter = numeric(), obb_error = numeric())

## Working copies: accumulate predictions here so the test set itself stays undistorted.
predictedLabels_table <- copy(test_set)
In [46]:
# Active-learning loop (adaptive/uncertainty sampling, no feature elimination).
# Per iteration: fit an RF metamodel, record its OOB and test-set errors, then
# (unless the budget is exhausted) select the `selected_ins` most uncertain
# points from the unlabeled pool, label them by running the ABM, and grow the
# training set with them.
set.seed(10)
print(paste0("section start time : ",Sys.time()))
iter = 1
while(iter <= iteration_budget){   
    print(iter)

    # Current training inputs/labels (training_set_Ad grows every iteration).
    trainx = training_set_Ad[,.SD, .SDcols = feature_names]
    trainy = training_set_Ad$output
    
    # Train the model
    model_Sub <- randomForest( x = trainx, y =  trainy,importance = TRUE,ntree = ntree, mtry = mtry)
    # Keep each iteration's model in the global env as model_Sub_<iter>.
    assign(paste0("model_Sub_",iter),model_Sub)
                    
    # obb_error_func() (defined earlier in the file) extracts the RF OOB error.
    obb_error = rbind(obb_error,data.table(iter,obb_error_func(model_Sub)),use.names=FALSE)

    # test the model on test set
    test_predictions_Sub = get_test_predictions(model_Sub,test_set,error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    # Tag this iteration's prediction/error columns before binding them on.
    setnames(predictedLabels_Sub,c("pred_output",error_type), c(paste0("pred_output_",iter),paste0(error_type,"_",iter)))    
    predictedLabels_table = cbind(predictedLabels_table,predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_",iter),paste0(error_type,"_",iter))])
    
    # Keep test set error records
    performance_table = rbind(performance_table,data.table(iter,test_predictions_Sub[[2]]), use.names = FALSE)
    
    if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.    
    ## sample selection from unlabeled data select candidates
        unlabeled_set <- copy(unlabeled_pool)
        train_candidates = sample_selection(selected_ins, unlabeled_set, model_Sub)
        
        # eliminate candidates from the unlabeled pool
        unlabeled_pool = unlabeled_pool[-train_candidates$idx]
        rm(unlabeled_set)
        
        # run ABM to find outputs of train candidates
        print(paste0("ABM train_candidate run start time : ",Sys.time()))
        train_candidates = run_ABM(nofrep, selected_ins, train_candidates)
        print(paste0("ABM train_candidate run end time : ",Sys.time()))
        
        train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))
        
        # add labeled candidates to the train data
        training_set_Ad = rbind(training_set_Ad, train_candidates[, -c("idx")])
    }
    iter = iter + 1
}
print(paste0("section end time : ",Sys.time()))
[1] "section start time : 2020-01-09 07:50:25"
[1] 1
[1] "ABM train_candidate run start time : 2020-01-09 07:50:25"
[1] "ABM train_candidate run end time : 2020-01-09 07:51:31"
[1] 2
[1] "ABM train_candidate run start time : 2020-01-09 07:51:31"
[1] "ABM train_candidate run end time : 2020-01-09 07:52:11"
[1] 3
[1] "ABM train_candidate run start time : 2020-01-09 07:52:11"
[1] "ABM train_candidate run end time : 2020-01-09 07:53:23"
[1] 4
[1] "ABM train_candidate run start time : 2020-01-09 07:53:23"
[1] "ABM train_candidate run end time : 2020-01-09 07:53:56"
[1] 5
[1] "ABM train_candidate run start time : 2020-01-09 07:53:56"
[1] "ABM train_candidate run end time : 2020-01-09 07:55:01"
[1] 6
[1] "ABM train_candidate run start time : 2020-01-09 07:55:01"
[1] "ABM train_candidate run end time : 2020-01-09 07:55:36"
[1] 7
[1] "ABM train_candidate run start time : 2020-01-09 07:55:36"
[1] "ABM train_candidate run end time : 2020-01-09 07:56:24"
[1] 8
[1] "ABM train_candidate run start time : 2020-01-09 07:56:24"
[1] "ABM train_candidate run end time : 2020-01-09 07:57:03"
[1] 9
[1] "ABM train_candidate run start time : 2020-01-09 07:57:03"
[1] "ABM train_candidate run end time : 2020-01-09 07:58:08"
[1] 10
[1] "ABM train_candidate run start time : 2020-01-09 07:58:08"
[1] "ABM train_candidate run end time : 2020-01-09 07:58:57"
[1] 11
[1] "ABM train_candidate run start time : 2020-01-09 07:58:57"
[1] "ABM train_candidate run end time : 2020-01-09 07:59:27"
[1] 12
[1] "ABM train_candidate run start time : 2020-01-09 07:59:27"
[1] "ABM train_candidate run end time : 2020-01-09 08:00:03"
[1] 13
[1] "ABM train_candidate run start time : 2020-01-09 08:00:03"
[1] "ABM train_candidate run end time : 2020-01-09 08:00:36"
[1] 14
[1] "ABM train_candidate run start time : 2020-01-09 08:00:37"
[1] "ABM train_candidate run end time : 2020-01-09 08:01:05"
[1] 15
[1] "ABM train_candidate run start time : 2020-01-09 08:01:05"
[1] "ABM train_candidate run end time : 2020-01-09 08:01:38"
[1] 16
[1] "ABM train_candidate run start time : 2020-01-09 08:01:38"
[1] "ABM train_candidate run end time : 2020-01-09 08:02:13"
[1] 17
[1] "ABM train_candidate run start time : 2020-01-09 08:02:13"
[1] "ABM train_candidate run end time : 2020-01-09 08:03:06"
[1] 18
[1] "ABM train_candidate run start time : 2020-01-09 08:03:07"
[1] "ABM train_candidate run end time : 2020-01-09 08:03:56"
[1] 19
[1] "ABM train_candidate run start time : 2020-01-09 08:03:56"
[1] "ABM train_candidate run end time : 2020-01-09 08:04:34"
[1] 20
[1] "ABM train_candidate run start time : 2020-01-09 08:04:35"
[1] "ABM train_candidate run end time : 2020-01-09 08:05:12"
[1] 21
[1] "section end time : 2020-01-09 08:05:12"

Started 2020-01-09 07:50:25 // ended 2020-01-09 08:05:12 — 3 replications × 25 samples × 20 selection iterations = 1500 ABM runs.

In [47]:
# Snapshot the adaptive-sampling (no-elimination) results under scenario-
# specific names so the working tables can be reused by the next scenario.
FinalTrainData_Ad         <- copy(training_set_Ad)
performance_table_Ad      <- copy(performance_table)
train_candidates_table_Ad <- copy(train_candidates_table)
predictedLabels_table_Ad  <- copy(predictedLabels_table)
obb_error_Ad              <- copy(obb_error)
In [48]:
# Persist all adaptive-run record tables to dated CSVs (same names as before).
ad_file <- function(nm) paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/", nm, "_BasicCode_Oneshot_120tree_", Sys.Date(), ".csv")
fwrite(FinalTrainData_Ad,         ad_file("FinalTrainData_Ad"))
fwrite(performance_table_Ad,      ad_file("performance_table_Ad"))
fwrite(train_candidates_table_Ad, ad_file("train_candidates_table_Ad"))
fwrite(predictedLabels_table_Ad,  ad_file("predictedLabels_table_Ad"))
fwrite(obb_error_Ad,              ad_file("obb_error_Ad"))
In [49]:
# Inspect the final adaptive-sampling records (auto-printed in the notebook).
nrow(FinalTrainData_Ad)
performance_table_Ad
train_candidates_table_Ad
head(predictedLabels_table_Ad)
obb_error_Ad
700
A data.table: 21 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
12.0984314.7266303.057153
21.8593754.0937942.644220
31.6208714.1089582.385732
41.4883993.7507372.198436
51.4276673.7293292.122453
61.4038663.7466272.092942
71.3536483.7414952.024827
81.3257213.7122591.974942
91.2337143.6026021.840287
101.2385213.6591661.841446
111.2095463.6049811.797331
121.2050793.6158371.791390
131.1762363.5921171.753406
141.1284573.5559401.687339
151.1422213.5721881.707218
161.1433593.6061231.709295
171.1418293.5979851.698115
181.1196853.6052471.669113
191.1349133.6027881.691683
201.1249723.5734371.668677
211.0896513.5702741.622426
A data.table: 500 × 5
density%-similar-wantedidxoutputiter
<dbl><dbl><int><dbl><dbl>
79.1863874.5911451180.491051
52.5150274.9953513699.780081
66.3243474.8561319599.690761
17.7986875.8354124099.215691
70.4786176.76088 6056.371541
77.5987877.1502234753.758661
38.0483974.8949727099.777921
20.3701275.1407852589.084381
66.0970977.6409157656.008021
22.2315689.1719456168.670531
21.6072685.9512141671.441691
49.5289675.20317 1664.672901
44.1887376.6252724766.899191
49.0230977.0017420969.104501
51.8340177.2980258164.317141
61.3530375.3873717357.126171
54.7581076.5828730665.936961
23.9634387.89434 8465.430821
82.7606876.4301515452.808051
41.2925475.9178063172.336451
88.9031074.19145 876.274341
24.9680574.6532958099.965531
32.1509676.2694846077.336481
57.4877475.7097831758.782051
19.7934883.2254864989.590641
33.8388579.3084617978.318542
20.9848480.9727117578.851162
12.2721812.10815 1990.619742
14.1575913.4271253384.396052
15.5894411.77994 8082.337982
...............
84.2135966.47530 9198.1808619
36.4993243.63156 3790.5761919
72.4786032.80387 8171.9111019
80.1184311.2414813851.1064819
74.2141511.9082115651.4560719
61.6638514.9894720554.6052220
63.7463537.62292 3380.2884020
50.6585855.6461912696.2317520
52.4417154.98247 995.9250520
88.3218845.3571519885.7286420
51.3464552.6153620396.2954320
71.0139610.8369921052.0820120
49.7505557.86407 4196.7411620
53.4031522.9846711564.5304220
53.0457057.5722619096.7316220
71.4917860.71702 6298.3702220
29.6828744.1657517391.6221720
22.7753741.7140019494.0648320
46.0782453.1911517497.0899120
45.5416558.06024 5897.1198620
73.4716682.7560917651.3538620
46.1018460.5295216598.9383020
80.3367430.65946 7071.8373720
41.2204760.1673321298.7857320
44.9659788.66144 5053.2336320
30.8708146.04277 3291.8532420
57.3104948.5632611288.7885720
48.5467851.7343910796.2121620
82.6288631.9253619673.5169520
69.0325352.7587614294.4234420
A data.table: 6 × 45
density%-similar-wantedoutputpred_output_1RMSE_1pred_output_2RMSE_2pred_output_3RMSE_3pred_output_4...pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20pred_output_21RMSE_21
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
39.2718236.4603986.9769385.096481.8804481985.087021.8899096385.353181.6237479287.29795...86.103600.8733299186.242320.7346015886.138120.8388061386.164660.8122687886.095250.88167326
28.1221068.0556499.8650499.855980.0090642999.875890.0108504999.887750.0227088299.85919...99.850920.0141197799.898780.0337326799.895240.0301983199.901650.0366041499.884040.01899542
75.8097917.8004457.7748758.079410.3045396858.068440.2935721858.072880.2980147958.01455...58.269360.4944943458.314430.5395575858.208510.4336443458.112870.3380031158.269160.49429048
57.7682528.6434573.8125772.920020.8925496772.818230.9943376072.563561.2490047172.94710...72.094881.7176866472.327811.4847583972.243091.5694802172.190361.6222052772.274691.53787091
18.5317548.0377893.7924294.063730.2713098594.104000.3115805394.274680.4822580694.59044...94.469400.6769836294.518220.7258029494.557040.7646225194.500670.7082535394.618720.82629938
80.0970142.1845582.7019185.229082.5271657485.229952.5280398085.379502.6775889385.30000...82.970700.2687846783.073680.3717697483.157000.4550908783.195410.4934966983.109180.40726512
A data.table: 21 × 2
iterobb_error
<dbl><dbl>
115.080129
214.570913
310.507075
410.546756
5 8.803361
6 8.018189
7 7.473081
8 6.942357
9 6.331387
10 6.111400
11 5.501289
12 5.338440
13 5.219288
14 5.539184
15 4.926805
16 4.932839
17 4.366652
18 4.887069
19 3.941157
20 4.168353
21 4.390004
In [50]:
# Error trajectories of the adaptive run; dashed horizontal lines mark the
# one-shot baseline errors for comparison.
performance_molten_Ad <- melt(data = performance_table_Ad, id.vars = 'iter')
setnames(performance_molten_Ad, c("variable", "value"), c("errortype", "errorvalue"))
p_Ad = ggplot(performance_molten_Ad, aes(x = iter, y = errorvalue, group = errortype, col = errortype)) +
    geom_line(lwd = 1) +
    # `stat = "hline"` was removed from ggplot2 (>= 2.0); geom_hline supplies
    # its own stat, so passing it fails/warns on current versions.
    geom_hline(data = performance_molten_oneshot,
               aes(yintercept = errorvalue, group = errortype, col = errortype),
               linetype = "dashed")

p_Ad

Final Visualization

In [51]:
# PCA of the final adaptive training set, colored by the simulated output.
pca_final_Ad_training_set <- princomp(FinalTrainData_Ad[, .SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)

pca_final_Ad_training_set_components <- cbind(
    get_pca_ind(pca_final_Ad_training_set)$coord[, 1:2],
    FinalTrainData_Ad[, .SD, .SDcols = c("output")]
)
p_final_Ad_training_set <- ggplot(pca_final_Ad_training_set_components, aes(x = Dim.1, y = Dim.2)) +
    geom_point(aes(colour = output)) +
    labs(title = "", legend = "output")
p_final_Ad_training_set
In [52]:
grid.arrange(p_training_set_Ad,p_final_Ad_training_set, ncol=2)

3 scenarios

In [145]:
# Sanity check: all three strategies end with equally sized training sets (700),
# so the metamodel comparison below is on equal labeling budgets.
nrow(FinalTrainData_Ad)
nrow(training_set)
nrow(FinalTrainData_Rd)
700
700
700
In [146]:
# Fit one RF metamodel per sampling strategy on its final training set,
# all with identical hyperparameters (ntree, mtry).
oneshot_model <- randomForest(x = training_set[, -c("output")],
                              y = training_set$output,
                              importance = TRUE, ntree = ntree, mtry = mtry)
random_model <- randomForest(x = FinalTrainData_Rd[, .SD, .SDcols = feature_names],
                             y = FinalTrainData_Rd$output,
                             importance = TRUE, ntree = ntree, mtry = mtry)
adaptive_model <- randomForest(x = FinalTrainData_Ad[, .SD, .SDcols = feature_names],
                               y = FinalTrainData_Ad$output,
                               importance = TRUE, ntree = ntree, mtry = mtry)
In [147]:
oneshot_model
random_model
adaptive_model
Call:
 randomForest(x = training_set[, -c("output")], y = training_set$output,      ntree = ntree, mtry = mtry, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 120
No. of variables tried at each split: 2

          Mean of squared residuals: 9.648594
                    % Var explained: 96.8
Call:
 randomForest(x = FinalTrainData_Rd[, .SD, .SDcols = feature_names],      y = FinalTrainData_Rd$output, ntree = ntree, mtry = mtry,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 120
No. of variables tried at each split: 2

          Mean of squared residuals: 3.34362
                    % Var explained: 98.87
Call:
 randomForest(x = FinalTrainData_Ad[, .SD, .SDcols = feature_names],      y = FinalTrainData_Ad$output, ntree = ntree, mtry = mtry,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 120
No. of variables tried at each split: 2

          Mean of squared residuals: 4.010939
                    % Var explained: 98.5
In [148]:
# Score the common test set with each metamodel; each result table carries the
# test set plus one prediction column named after the model.
pred_oneshot <- cbind(test_set, pred_oneshot = predict(oneshot_model, test_set))

pred_rd <- cbind(test_set, pred_rd = predict(random_model, test_set))

pred_ad <- cbind(test_set, pred_ad = predict(adaptive_model, test_set))
In [149]:
# Per-row RMSE of each model's prediction against the ABM output
# (mapply can take rmse_func directly; no anonymous wrapper needed).
pred_oneshot[, RMSE := mapply(rmse_func, output, pred_oneshot)]
pred_rd[, RMSE := mapply(rmse_func, output, pred_rd)]
pred_ad[, RMSE := mapply(rmse_func, output, pred_ad)]
In [150]:
# Overall test-set RMSE per model. rmse() presumably comes from TSrepr
# (loaded at the top) -- verify, since caret also exports an RMSE function.
rmse(pred_oneshot$output,pred_oneshot$pred_oneshot)
rmse(pred_rd$output,pred_rd$pred_rd)
rmse(pred_ad$output,pred_ad$pred_ad)
2.16127288713123
4.02974568509003
3.56785097720262
In [151]:
# Out-of-bag MSE versus number of trees for each metamodel.
plot(oneshot_model$mse, type="l")
plot(random_model$mse, type="l")
plot(adaptive_model$mse, type="l")

Random Sampling vs Uncertainty Sampling

Final Data Comparison

In [124]:
# 2x2 grid of training-set PCA plots: initial sets (top row) vs final sets
# (bottom row), random and adaptive scenarios.
grid.arrange(p_training_set, p_training_set_Ad,p_final_Rd_training_set,p_final_Ad_training_set,nrow=2, ncol=2)

Performance Comparison

In [125]:
# Side-by-side error trajectories: random sampling (left) vs adaptive (right).
grid.arrange(p_Rd, p_Ad, ncol=2)
In [126]:
# Random vs adaptive sampling error curves, faceted by scenario; dashed
# horizontal lines show the one-shot baseline.
performance_Rd_vs_Ad = rbind(performance_molten_Rd[,.(iter,errortype,errorvalue, type = "Rd")],performance_molten_Ad[,.(iter,errortype,errorvalue, type = "Ad")])
p_Rd_vs_Ad = ggplot(performance_Rd_vs_Ad, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) + 
            geom_line(lwd=1) +
            # `stat = "hline"` was removed from ggplot2 (>= 2.0); geom_hline supplies its own stat.
            geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype), linetype = "dashed") +
            facet_wrap(~type)
p_Rd_vs_Ad
In [127]:
# Interactive (plotly) version of the comparison plot.
ggplotly(p_Rd_vs_Ad)
In [63]:
# Compare both strategies' RMSE at iteration 20 against the one-shot baseline,
# in absolute and relative terms.
comp <- performance_Rd_vs_Ad[iter == 20 & errortype == "rmse"]
comp[, oneshot_error := performance_molten_oneshot[errortype == "rmse"]$errorvalue]
comp[, diff := errorvalue - oneshot_error]
comp[, diff_perc := (errorvalue - oneshot_error) / oneshot_error]
comp
A data.table: 2 × 7
itererrortypeerrorvaluetypeoneshot_errordiffdiff_perc
<dbl><fct><dbl><chr><dbl><dbl><dbl>
20rmse3.997953Rd2.2574661.7404870.7709913
20rmse3.573437Ad2.2574661.3159710.5829418

Adaptive Sampling & Feature Elimination

Generate Training Set

Select a relatively large data pool (nofinstances should be medium), e.g. around 400 instances.

In [74]:
# Reset the training set and unlabeled candidate pool for the AdFe scenario;
# copy() guards the originals against data.table reference semantics.
training_set_Ad <- copy(adaptive_initial_data)
unlabeled_pool  <- copy(data_candidates)
In [ ]:
#if(GenerateTTData == 1){
#   
#    LHSample_Ad = as.data.table(maximinLHS(n = train_ins_Ad, k = nofparams, dup = 5))
#    
#    LHSample_Ad$V1 = qunif(LHSample_Ad$V1, 10, 90) 
#    LHSample_Ad$V2 = qunif(LHSample_Ad$V2, 10, 90) 
#    setnames(LHSample_Ad, c("V1","V2"), feature_names)
#    LHSample_Ad$output <- 0.00
#    
#    paste0("ABM run start time : ",Sys.time())
#    LHSample_Ad = run_ABM(nofrep,train_ins_Ad,LHSample_Ad) %>% as.data.table()
#    paste0("ABM run end time : ",Sys.time())
#    
#    fwrite(LHSample_Ad, paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data",Sys.Date(),".csv"))
#
#}else{
#    LHSample_Ad <- fread("C:/Users/paslanpatir/Desktop/TEZ_v2/LHSample_Ad_Data_04122019.csv")
#    LHSample_Ad <- head(LHSample_Ad[`%-similar-wanted` < 90  & `density` < 90],200)
#
#}

Visualization

In [68]:
# PCA of the (reset) training inputs; cor = TRUE standardizes the features.
pca_training_set_Ad <- princomp(training_set_Ad[, .SD, .SDcols = !c("output")], cor = TRUE, scores = TRUE)
In [69]:
#fviz_pca_ind(pca_LHSample,
#             col.ind = "cos2", # Color by the quality of representation
#             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
#              geom="point"
#             )

# First two principal components of the initial training set, colored by output.
pca_training_set_Ad_components <- get_pca_ind(pca_training_set_Ad)
pca_training_set_Ad_components <- cbind(pca_training_set_Ad_components$coord[, 1:2],
                                        training_set_Ad[, c("output")])
p_training_set_Ad <- ggplot(pca_training_set_Ad_components, aes(x = Dim.1, y = Dim.2)) +
    geom_point(aes(colour = output)) +
    labs(title = "", legend = "output")
p_training_set_Ad

Train and Test Metamodel

In [75]:
# Decide on strategy:
# select new samples on iterations 1-20, and eliminate features once, at
# iteration 20. iteration_budget (21) must exceed both schedules' maxima.
sample_selection_iteration_order = c(1:20)
feature_elimination_iteration_order = c(20)
#iteration_budget = 3 # should be > max(max(sample_selection_iteration_order),max(feature_elimination_iteration_order))

#h = 1 # specify how many variable will be eliminated in each elimination iteration
In [76]:
feature_names
  1. 'density'
  2. '%-similar-wanted'
In [77]:
## Initialize the per-iteration record tables.
# Instances picked from the unlabeled pool at each iteration.
train_candidates_table <- data.table()

# Test-set error of the metamodel at each iteration.
performance_table <- data.table(iter = numeric(), mae = numeric(), rmse = numeric(), mape = numeric())

# Out-of-bag error of the random forest at each iteration.
obb_error <- data.table(iter = numeric(), obb_error = numeric())

# Flags recording which actions (selection / elimination) ran each iteration.
iteration_history <- data.table(iter_no = numeric(), IsFeatureEliminated = logical(), IsDataSelected = logical())

## Working copies: accumulate predictions here so the test set stays undistorted.
predictedLabels_table <- copy(test_set)

# Features still in play; elimination removes entries from this set.
columns_left <- feature_names
total_numof_eliminated_vars <- 0
In [78]:
# Active-learning loop with feature elimination.
# Same structure as the no-elimination loop above, plus: feature importances
# are ranked while all features are still present, and on the iterations in
# feature_elimination_iteration_order the elimination step removes h features
# from columns_left, so later fits use a reduced feature set.
set.seed(10)
print(paste0("section start time : ",Sys.time()))
iter = 1
while (iter <= iteration_budget) {
    
    # Fit only on the features that have survived elimination so far.
    trainx = training_set_Ad[, .SD, .SDcols = columns_left]
    trainy = training_set_Ad$output
    
    # Train the model
    model_Sub <- randomForest(x = trainx, y = trainy, importance = TRUE, ntree = ntree, mtry = mtry)
    # Keep each iteration's model in the global env as model_Sub_<iter>.
    assign(paste0("model_Sub_", iter), model_Sub)
    
    # Rank features only while the full feature set is in use, so the ranking
    # reflects all variables.
    if (length(columns_left) == length(feature_names)) {
        ranked_features = get_variable_importance(model_Sub)
    }
    # Keep training set error records
    obb_error = rbind(obb_error, data.table(iter, obb_error_func(model_Sub)), use.names = FALSE)
    
    # Test the model on test set
    test_predictions_Sub = get_test_predictions(model_Sub, test_set, error_type)
    predictedLabels_Sub = test_predictions_Sub[[1]]
    # Tag this iteration's prediction/error columns before binding them on.
    setnames(predictedLabels_Sub, c("pred_output", error_type), c(paste0("pred_output_", iter), paste0(error_type, "_", iter)))
    predictedLabels_table = cbind(predictedLabels_table, predictedLabels_Sub[,.SD, .SDcols = c(paste0("pred_output_", iter), paste0(error_type, "_", iter))])
    
    # Keep test set error records
    performance_table = rbind(performance_table, data.table(iter, test_predictions_Sub[[2]]), use.names = FALSE)
    
    # update iteration_history (flags are set to 1 below when the actions run)
    iteration_history = rbind(iteration_history, data.table(iter, 0, 0), use.names = FALSE)
    
    if(iter != iteration_budget){ # below efforts are unnecessary when the budget is reached.
          if (iter %in% sample_selection_iteration_order) {
              ## sample selection from unlabeled data select candidates
              unlabeled_set <- copy(unlabeled_pool)
              train_candidates = sample_selection(selected_ins, unlabeled_set, model_Sub)
              
              # eliminate candidates from the unlabeled pool
              unlabeled_pool = unlabeled_pool[-train_candidates$idx]
              rm(unlabeled_set)
              
              # run ABM to find outputs of train candidates
              print(paste0("ABM train_candidate run start time : ",Sys.time()))
              train_candidates = run_ABM(nofrep, selected_ins, train_candidates)
              print(paste0("ABM train_candidate run end time : ",Sys.time()))
              
              train_candidates_table = rbind(train_candidates_table, data.table(train_candidates,iter = iter))
              
              # add labeled candidates to the train data
              training_set_Ad = rbind(training_set_Ad, train_candidates[, -c("idx")])
              
              # update iteration_history
               iteration_history[iter]$IsDataSelected= 1
          }
          if (iter %in% feature_elimination_iteration_order) {
              ## feature elimination apply feature elimination
              feature_elimination_result = feature_elimination(h, total_numof_eliminated_vars, ranked_features)
              
              columns_left = feature_elimination_result[[1]]  # 
              eliminated_columns = feature_elimination_result[[4]]  #   not necessary
              total_numof_eliminated_vars = as.numeric(feature_elimination_result[2])
              numof_eliminated_vars = as.numeric(feature_elimination_result[3])  #   not necessary 
              
              # update iteration_history
              iteration_history[iter]$IsFeatureEliminated= 1
          }
    }
iter = iter + 1  
}
print(paste0("section end time : ",Sys.time()))
[1] "section start time : 2020-01-09 08:44:58"
[1] "ABM train_candidate run start time : 2020-01-09 08:44:58"
[1] "ABM train_candidate run end time : 2020-01-09 08:46:48"
[1] "ABM train_candidate run start time : 2020-01-09 08:46:48"
[1] "ABM train_candidate run end time : 2020-01-09 08:47:30"
[1] "ABM train_candidate run start time : 2020-01-09 08:47:30"
[1] "ABM train_candidate run end time : 2020-01-09 08:48:55"
[1] "ABM train_candidate run start time : 2020-01-09 08:48:55"
[1] "ABM train_candidate run end time : 2020-01-09 08:50:07"
[1] "ABM train_candidate run start time : 2020-01-09 08:50:07"
[1] "ABM train_candidate run end time : 2020-01-09 08:50:48"
[1] "ABM train_candidate run start time : 2020-01-09 08:50:48"
[1] "ABM train_candidate run end time : 2020-01-09 08:51:25"
[1] "ABM train_candidate run start time : 2020-01-09 08:51:25"
[1] "ABM train_candidate run end time : 2020-01-09 08:51:56"
[1] "ABM train_candidate run start time : 2020-01-09 08:51:56"
[1] "ABM train_candidate run end time : 2020-01-09 08:52:48"
[1] "ABM train_candidate run start time : 2020-01-09 08:52:49"
[1] "ABM train_candidate run end time : 2020-01-09 08:53:17"
[1] "ABM train_candidate run start time : 2020-01-09 08:53:17"
[1] "ABM train_candidate run end time : 2020-01-09 08:53:46"
[1] "ABM train_candidate run start time : 2020-01-09 08:53:46"
[1] "ABM train_candidate run end time : 2020-01-09 08:54:23"
[1] "ABM train_candidate run start time : 2020-01-09 08:54:23"
[1] "ABM train_candidate run end time : 2020-01-09 08:55:15"
[1] "ABM train_candidate run start time : 2020-01-09 08:55:15"
[1] "ABM train_candidate run end time : 2020-01-09 08:55:51"
[1] "ABM train_candidate run start time : 2020-01-09 08:55:52"
[1] "ABM train_candidate run end time : 2020-01-09 08:56:58"
[1] "ABM train_candidate run start time : 2020-01-09 08:56:59"
[1] "ABM train_candidate run end time : 2020-01-09 08:57:29"
[1] "ABM train_candidate run start time : 2020-01-09 08:57:29"
[1] "ABM train_candidate run end time : 2020-01-09 08:58:36"
[1] "ABM train_candidate run start time : 2020-01-09 08:58:36"
[1] "ABM train_candidate run end time : 2020-01-09 08:59:14"
[1] "ABM train_candidate run start time : 2020-01-09 08:59:15"
[1] "ABM train_candidate run end time : 2020-01-09 09:00:12"
[1] "ABM train_candidate run start time : 2020-01-09 09:00:13"
[1] "ABM train_candidate run end time : 2020-01-09 09:00:58"
[1] "ABM train_candidate run start time : 2020-01-09 09:00:58"
[1] "ABM train_candidate run end time : 2020-01-09 09:02:11"
[1] "section end time : 2020-01-09 09:02:11"

950 ABM runs: section started 2020-01-08 21:46:52 // ended 2020-01-08 22:02:30.

In [ ]:
# NOTE(review): performance_error_table is only created by the commented-out
# lines above, so evaluating the last line as-is raises
# "object 'performance_error_table' not found". Commented out to keep the
# cell inert until the block above is re-enabled.
#performance_error_table = performance_table[,.SD,.SDcols = c("iter",tolower(error_type))]
#setnames(performance_error_table,c("iter","error"))
#performance_error_table[, lag_error := shift(error,1,type = "lag")]
#performance_error_table
#performance_error_table[.N]
In [79]:
# Features surviving elimination (output below shows only %-similar-wanted remained).
columns_left
'%-similar-wanted'
In [80]:
# Snapshot the AdFe (adaptive + feature elimination) run results under
# scenario-specific names.
FinalTrainData_AdFe         <- copy(training_set_Ad)
performance_table_AdFe      <- copy(performance_table)
train_candidates_table_AdFe <- copy(train_candidates_table)
predictedLabels_table_AdFe  <- copy(predictedLabels_table)
obb_error_AdFe              <- copy(obb_error)
In [81]:
#fwrite(FinalTrainData_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/FinalTrainData_AdFe_BasicCode_Oneshot_120tree_",Sys.Date(),".csv") )
#fwrite(performance_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/performance_table_AdFe_BasicCode_Oneshot_120tree_",Sys.Date(),".csv") )
#fwrite(train_candidates_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/train_candidates_table_AdFe_BasicCode_Oneshot_120tree_",Sys.Date(),".csv") )
#fwrite(predictedLabels_table_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/predictedLabels_table_AdFe_BasicCode_Oneshot_120tree_",Sys.Date(),".csv") )
#fwrite(obb_error_AdFe,paste0("C:/Users/paslanpatir/Desktop/TEZ_v2/obb_error_AdFe_BasicCode_Oneshot_120tree_",Sys.Date(),".csv") )
In [82]:
# Inspect the AdFe records. Note in the output below the error jump at
# iteration 21 -- the first model fitted after feature elimination at iter 20.
nrow(FinalTrainData_AdFe)
performance_table_AdFe
train_candidates_table_AdFe
head(predictedLabels_table_AdFe)
obb_error_AdFe
700
A data.table: 21 × 4
itermaermsemape
<dbl><dbl><dbl><dbl>
12.0984314.7266303.057153
21.8001114.1049462.604973
31.6716694.0746452.442391
41.6162383.8295342.361563
51.5474323.7858362.266611
61.5282203.8092312.245446
71.4715423.7888962.155949
81.4123663.7995502.082855
91.3503503.6686671.982762
101.2855883.6787191.903817
111.2767183.6619091.883191
121.2338283.6504301.824850
131.2104323.6132871.795549
141.1995283.6248401.774160
151.1734913.5485421.751395
161.2044423.5944771.789988
171.1682523.6118061.741026
181.1844173.6225061.762875
191.1676533.5578431.727892
201.1810673.5538611.747526
216.1774589.9201509.183042
A data.table: 500 × 5
density%-similar-wantedidxoutputiter
<dbl><dbl><int><dbl><dbl>
79.1863874.5911451183.150401
52.5150274.9953513699.659311
66.3243474.8561319599.685541
17.7986875.8354124097.931581
70.4786176.76088 6054.220981
77.5987877.1502234753.199671
38.0483974.8949727099.817661
20.3701275.1407852597.269711
66.0970977.6409157656.939761
22.2315689.1719456169.773421
21.6072685.9512141676.287651
49.5289675.20317 1671.757751
44.1887376.6252724773.350001
49.0230977.0017420967.525251
51.8340177.2980258161.302411
61.3530375.3873717358.252681
54.7581076.5828730660.594341
23.9634387.89434 8467.962421
82.7606876.4301515452.516451
41.2925475.9178063174.753971
88.9031074.19145 878.955591
24.9680574.6532958099.811381
32.1509676.2694846079.228661
57.4877475.7097831762.596041
19.7934883.2254864979.841601
18.6606886.56811 4090.692402
17.9556888.8707367493.972842
75.2477475.3334159453.610232
33.8388579.3084617983.165902
14.1575913.4271253384.011182
...............
83.3706515.84925175 54.3735319
20.2711641.38810 72 92.9077119
83.3060948.38970249 87.5510219
83.8712319.49375219 55.8984119
48.5467851.73439121 96.8114619
80.2255059.66944144 96.7780320
80.3367430.65946 75 72.3137920
85.4858330.82709 36 72.7909020
88.7095679.96555111 51.7951420
74.5987231.15977158 72.2909120
77.9436532.22095174 73.2072120
82.6288631.92536202 73.1547520
62.2523330.23400195 73.7289520
74.2141511.90821143 51.9777020
89.5231212.55836 60 52.0013520
86.7907118.78930129 57.3378320
77.2063358.49839197 96.6993220
11.7621072.89591 57100.0000020
41.0283972.98888112 99.7796320
65.5522832.46661 10 73.7732720
23.0214344.95784 70 93.5288820
22.7753741.71400200 92.9805120
60.6908847.57684 52 88.3799020
20.8269246.90200191 93.7181320
73.4716682.75609181 52.6459020
68.6633931.01181 25 71.6845620
80.8546883.96997 54 50.6998220
83.6028084.49948222 51.8271620
72.4786032.80387 83 72.4858820
76.9274612.88288 98 52.2197920
A data.table: 6 × 45
density%-similar-wantedoutputpred_output_1RMSE_1pred_output_2RMSE_2pred_output_3RMSE_3pred_output_4...pred_output_17RMSE_17pred_output_18RMSE_18pred_output_19RMSE_19pred_output_20RMSE_20pred_output_21RMSE_21
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
39.2718236.4603986.9769385.096481.8804481984.867532.1093980184.844842.1320825086.76029...86.433230.5436926186.637800.3391301186.387330.5895954786.234340.7425883989.652682.6757550
28.1221068.0556499.8650499.855980.0090642999.830070.0349676999.829130.0359087899.85173...99.846320.0187193199.838880.0261660599.831720.0333253299.851970.0130771299.547160.3178800
75.8097917.8004457.7748758.079410.3045396858.345730.5708658658.222850.4479870257.51554...58.158410.3835437258.155560.3806906658.259730.4848610158.232530.4576636564.485676.7108004
57.7682528.6434573.8125772.920020.8925496773.081210.7313552172.869510.9430530872.68836...72.759601.0529610372.648721.1638478372.653731.1588364072.629251.1833145173.130060.6825033
18.5317548.0377893.7924294.063730.2713098594.140540.3481180394.233370.4409530394.60531...94.572010.7795913294.552580.7601613194.450800.6583853094.566700.7742829288.309765.4826549
80.0970142.1845582.7019185.229082.5271657485.235852.5339351385.077632.3757199785.18827...83.079120.3772115383.211550.5096418183.214010.5120950983.116220.4143136190.698397.9964760
A data.table: 21 × 2
iterobb_error
<dbl><dbl>
1 15.080129
2 12.999502
3 11.485931
4 11.767707
5 9.527171
6 8.325295
7 7.559233
8 8.169771
9 6.837732
10 6.896286
11 6.375266
12 5.658079
13 4.932612
14 5.164344
15 5.641964
16 5.256454
17 4.737442
18 5.299231
19 4.452053
20 4.924206
21119.262569
In [83]:
iteration_history
A data.table: 21 × 3
iter_noIsFeatureEliminatedIsDataSelected
<dbl><dbl><dbl>
101
201
301
401
501
601
701
801
901
1001
1101
1201
1301
1401
1501
1601
1701
1801
1901
2011
2100
In [84]:
# Error trajectories of the AdFe run. Vertical lines mark the iteration after
# feature elimination (dashed) and after each data selection (yellow dot-dash).
performance_molten_AdFe <- melt(data = performance_table_AdFe, id.vars = 'iter')
setnames(performance_molten_AdFe, c("variable", "value"), c("errortype", "errorvalue"))
p_AdFe <- ggplot(performance_molten_AdFe,
                 aes(x = iter, y = errorvalue, group = errortype, col = errortype)) +
    geom_line(lwd = 1) +
    geom_vline(xintercept = iteration_history[IsFeatureEliminated == 1]$iter_no + 1, linetype = "dashed") +
    geom_vline(xintercept = iteration_history[IsDataSelected == 1]$iter_no + 1, linetype = "dotdash", color = "yellow")
p_AdFe

Adaptive Sampling with/without Feature Elimination

In [85]:
# Ad vs AdFe error curves, faceted by scenario; dashed horizontal lines show
# the one-shot baseline, dashed vertical lines the post-elimination iteration.
performance_Ad_vs_AdFe = rbind(performance_molten_Ad[,.(iter,errortype,errorvalue, type = "Ad")], performance_molten_AdFe[,.(iter,errortype,errorvalue, type = "AdFe")])
p_Ad_vs_AdFe = ggplot(performance_Ad_vs_AdFe, aes(x = iter, y = errorvalue, group=errortype, col=errortype)) + 
            geom_line(lwd=1) +
            geom_vline(xintercept = iteration_history[IsFeatureEliminated==1]$iter_no + 1, linetype = "dashed") +
            # `stat = "hline"` was removed from ggplot2 (>= 2.0); geom_hline supplies its own stat.
            geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype), linetype = "dashed") +
            facet_wrap(~type)
p_Ad_vs_AdFe
In [86]:
ggplotly(p_Ad_vs_AdFe)
In [ ]:
#head(train_candidates_table_AdFe)
#head(train_candidates_table_Ad)
In [64]:
# Variable-importance plot of the iteration-1 model (both features present).
varImpPlot(model_Sub_1)

Quit NL

In [ ]:
# Shut down the headless NetLogo instance started at the top of the file.
NLQuit(nl.obj = nl.model)
#NLQuit(all=FALSE)